{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Basic Statistics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.linalg import Vectors\n",
    "from pyspark.ml.stat import Correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),\n",
    "        (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),\n",
    "        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),\n",
    "        (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]\n",
    "df = spark.createDataFrame(data, [\"features\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Vectors.sparse(4, [(0, 1.0), (3, -2.0)]).toArray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pearson correlation matrix:\n",
      "DenseMatrix([[ 1.        ,  0.05564149,         nan,  0.40047142],\n",
      "             [ 0.05564149,  1.        ,         nan,  0.91359586],\n",
      "             [        nan,         nan,  1.        ,         nan],\n",
      "             [ 0.40047142,  0.91359586,         nan,  1.        ]])\n"
     ]
    }
   ],
   "source": [
    "r1 = Correlation.corr(df, \"features\").head()\n",
    "print(\"Pearson correlation matrix:\\n\" + str(r1[0]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Spearman correlation matrix:\n",
      "DenseMatrix([[ 1.        ,  0.10540926,         nan,  0.4       ],\n",
      "             [ 0.10540926,  1.        ,         nan,  0.9486833 ],\n",
      "             [        nan,         nan,  1.        ,         nan],\n",
      "             [ 0.4       ,  0.9486833 ,         nan,  1.        ]])\n"
     ]
    }
   ],
   "source": [
    "r2 = Correlation.corr(df, \"features\", \"spearman\").head()\n",
    "print(\"Spearman correlation matrix:\\n\" + str(r2[0]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Hypothesis testing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.linalg import Vectors\n",
    "from pyspark.ml.stat import ChiSquareTest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = [(0.0, Vectors.dense(0.5, 10.0)),\n",
    "        (0.0, Vectors.dense(1.5, 20.0)),\n",
    "        (1.0, Vectors.dense(1.5, 30.0)),\n",
    "        (0.0, Vectors.dense(3.5, 30.0)),\n",
    "        (0.0, Vectors.dense(3.5, 40.0)),\n",
    "        (1.0, Vectors.dense(3.5, 40.0))]\n",
    "df = spark.createDataFrame(data, [\"label\", \"features\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r = ChiSquareTest.test(df, \"features\", \"label\").head()\n",
    "print(\"pValues: \" + str(r.pValues))\n",
    "print(\"degreesOfFreedom: \" + str(r.degreesOfFreedom))\n",
    "print(\"statistics: \" + str(r.statistics))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Apache Toree - PySpark",
   "language": "python",
   "name": "apache_toree_pyspark"
  },
  "language_info": {
   "codemirror_mode": "text/x-ipython",
   "file_extension": ".py",
   "mimetype": "text/x-ipython",
   "name": "python",
   "pygments_lexer": "python",
   "version": "3.6.3\n"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
